After putting it off for a long time, I've finally started using Theano for our work here at Stanford. The initial reason was that I wanted to run experiments on the GPUs at our disposal. Over time, Theano came to seem like a better and better idea, because its automatic differentiation drastically shortened development and debugging time (in my experience, anyway).
In this tutorial, we will build a simple feed-forward neural network in Theano and train it to do regression.
In [23]:
import theano
from theano import tensor as T
import numpy as np
import pylab as P
%matplotlib inline
In [24]:
# Small example data: a 3x2 weight matrix W, a 3x3 input X, and a length-2 bias b
W = np.arange(6).reshape(3,2)/10.
X = np.arange(9).reshape(3,3)/10. - 0.5
b = np.arange(2) - 1
print 'X'
print X
print 'W'
print W
print 'b'
print b
In [25]:
print 'XW + b'
print np.dot(X, W) + b
print 'tanh(XW + b)'
print np.tanh(np.dot(X, W)+b)
We can do the identical calculation in Theano:
In [28]:
# Declare variables: W and b as shared variables, X as a symbolic input matrix
var_W = theano.shared(value=W)
var_b = theano.shared(value=b)
var_X = T.fmatrix()
# Construct computation graph
output = T.tanh(T.dot(var_X, var_W) + var_b)
# Compile the graph into a callable function (the shared parameters are baked in)
fn = theano.function(inputs=[var_X], outputs=output, allow_input_downcast=True)
# Compute
print fn(X)
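Because var_W and var_b are shared variables, their values are stored in the variables themselves rather than passed in at call time, so they can be swapped with set_value and fn immediately sees the change. A quick illustration (not part of the original walkthrough):
In [ ]:
# Shared variables can be updated in place; the compiled function sees the new values
var_W.set_value(np.zeros((3, 2)))
print fn(X)            # now tanh(0 + b) for every row
var_W.set_value(W)     # restore the original weights
Next, we wrap this pattern, shared parameters applied to a symbolic input, into a reusable Layer class.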
In [69]:
class Layer:
    num_layers = 0  # class-level counter, used to give each layer a unique name

    def __init__(self, num_in, num_out, X, activation=None):
        W_init, b_init = self.__class__.get_init(num_in, num_out)
        self.name = str(self.__class__.num_layers)
        self.X = X
        # Parameters are shared variables so Theano can update them in place
        self.W = theano.shared(value=W_init, name=self.name + '.W')
        self.b = theano.shared(value=b_init, name=self.name + '.b')
        pre_activation = T.dot(self.X, self.W) + self.b
        self.output = activation(pre_activation) if activation else pre_activation
        self.__class__.num_layers += 1

    @classmethod
    def get_init(cls, num_in, num_out):
        # Uniform initialization in [-bound, bound] for W, zeros for b
        bound = 6. / np.sqrt(num_in + num_out)
        W_init = np.random.uniform(low=-bound, high=bound, size=(num_in, num_out))
        b_init = np.zeros(num_out)
        return W_init, b_init
For example, we can instantiate a simple layer as follows:
In [70]:
layer = Layer(1, 1, X=T.fmatrix())
layer_fn = theano.function(
    inputs=[layer.X],
    outputs=layer.output,
    allow_input_downcast=True
)
x_axis = np.arange(-20, 20, 1.).reshape(-1,1)
out = layer_fn(x_axis)
P.figure()
P.plot(x_axis, out, 'g')
Out[70]: [figure: output of the untrained layer over x in [-20, 20), a straight line since there is no activation]
Next, we will actually train this layer to do something useful. A neat thing about Theano is that it supports automatic differentiation on the computation graph: T.grad gives us a symbolic gradient of a scalar loss with respect to any variable in the graph.
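To see this in isolation, here is a minimal example (not part of the original walkthrough) that differentiates x**2 with respect to x; the compiled gradient function simply returns 2x:
In [ ]:
# Minimal automatic-differentiation example: d(x**2)/dx = 2*x
x = T.dscalar('x')
grad_fn = theano.function(inputs=[x], outputs=T.grad(x ** 2, wrt=x))
print grad_fn(3.)   # prints 6.0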
In [71]:
X = T.fmatrix()
layer = Layer(1, 1, X, activation=T.tanh)
layer_fn = theano.function(
    inputs=[layer.X],
    outputs=layer.output,
    allow_input_downcast=True
)

lr = 0.5
x_axis = np.arange(-3, 0, 0.1).reshape(-1,1)
y_target = np.cos(x_axis).reshape(-1,1)

Y = T.fmatrix()
Loss = T.mean((layer.output - Y)**2)
# Each update pairs a shared variable with the expression for its new value,
# so every call to train_layer_fn takes one gradient-descent step.
train_layer_fn = theano.function(
    inputs=[X, Y],
    updates=[
        (layer.W, layer.W - lr * T.grad(Loss, wrt=layer.W)),
        (layer.b, layer.b - lr * T.grad(Loss, wrt=layer.b)),
    ],
    outputs=[Loss, layer.output],
    allow_input_downcast=True
)

for i in range(100):
    loss, guess = train_layer_fn(x_axis, y_target)
    if i % 20 == 0:
        P.figure()
        P.plot(x_axis, y_target, 'g')
        P.plot(x_axis, guess, 'r')
        print 'iter', i, 'loss', loss
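Since the learned parameters live in the layer's shared variables, we can inspect them directly with get_value() (a quick sanity check, not part of the original walkthrough):
In [ ]:
# Inspect the trained parameters stored in the shared variables
print 'W =', layer.W.get_value()
print 'b =', layer.b.get_value()
Next, we stack layers into a small Net class, which compiles a single training function that updates every layer's parameters.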
In [82]:
class Net:
    def __init__(self):
        self.X = T.fmatrix()
        self.layers = []

    def add_layer(self, n_in, n_out, activation=None):
        # Feed the previous layer's output (or the network input) into the new layer
        X = self.layers[-1].output if self.layers else self.X
        layer = Layer(n_in, n_out, X, activation)
        self.layers.append(layer)

    def compile_train(self):
        Y = T.fmatrix(name='Y')
        net_loss = T.mean((Y - self.layers[-1].output)**2)
        lr = T.fscalar(name='learning_rate')
        # Gradient-descent updates for every layer's parameters
        updates = []
        for layer in self.layers:
            updates += [
                (layer.W, layer.W - lr * T.grad(net_loss, wrt=layer.W)),
                (layer.b, layer.b - lr * T.grad(net_loss, wrt=layer.b)),
            ]
        return theano.function(
            inputs=[self.X, Y, lr],
            updates=updates,
            outputs=[net_loss, self.layers[-1].output],
            allow_input_downcast=True
        )
net = Net()
net.add_layer(1, 3, T.tanh)
net.add_layer(3, 1)
train = net.compile_train()

x_axis = np.arange(-3, 3, 0.1).reshape(-1,1)
y_target = np.cos(x_axis).reshape(-1,1)
for i in range(1000):
    loss, guess = train(x_axis, y_target, 0.1)
    if i % 200 == 0:
        P.figure()
        P.plot(x_axis, y_target, 'g')
        P.plot(x_axis, guess, 'r')
        print 'iter', i, 'loss', loss
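Once the network is trained, it is often handy to have a prediction-only function that evaluates the graph without applying any updates. A small sketch, reusing the symbolic input and output already stored on net:
In [ ]:
# A prediction-only function: same graph, no parameter updates
predict = theano.function(
    inputs=[net.X],
    outputs=net.layers[-1].output,
    allow_input_downcast=True
)
P.figure()
P.plot(x_axis, y_target, 'g')
P.plot(x_axis, predict(x_axis), 'r')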
And that's it! Without any attempt at optimizing the structure of the code, we ended up with very simple abstractions for Layer and for Net, runnable on the GPU via Theano.
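To actually run on a GPU, Theano is typically configured through the THEANO_FLAGS environment variable before it is imported; for the older CUDA backend the usual setting is device=gpu,floatX=float32 (the exact flags depend on your Theano version and backend). In a standalone script that might look like:
In [ ]:
import os
# Assumes the older CUDA backend; must run before theano is imported
os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
import theano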